Part 1: Data Overview

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(faraway)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Read in the data
redwines <- read.csv("redwines.csv")
head(redwines)
##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           7.4             0.70        0.00            1.9     0.076
## 2           7.8             0.88        0.00            2.6     0.098
## 3           7.8             0.76        0.04            2.3     0.092
## 4          11.2             0.28        0.56            1.9     0.075
## 5           7.4             0.70        0.00            1.9     0.076
## 6           7.4             0.66        0.00            1.8     0.075
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  11                   34  0.9978 3.51      0.56     9.4
## 2                  25                   67  0.9968 3.20      0.68     9.8
## 3                  15                   54  0.9970 3.26      0.65     9.8
## 4                  17                   60  0.9980 3.16      0.58     9.8
## 5                  11                   34  0.9978 3.51      0.56     9.4
## 6                  13                   40  0.9978 3.51      0.56     9.4
##   quality
## 1       5
## 2       5
## 3       5
## 4       6
## 5       5
## 6       5

Full model and Correlation Check

pairs(redwines[,-c(11,12)])

full.model <- lm(alcohol ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + free.sulfur.dioxide+ total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
summary(full.model)
## 
## Call:
## lm(formula = alcohol ~ fixed.acidity + volatile.acidity + citric.acid + 
##     residual.sugar + chlorides + free.sulfur.dioxide + total.sulfur.dioxide + 
##     density + pH + sulphates, data = redwines[, -c(12)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.07175 -0.39267 -0.04056  0.35396  2.44365 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.072e+02  1.308e+01  46.419  < 2e-16 ***
## fixed.acidity         5.324e-01  2.064e-02  25.796  < 2e-16 ***
## volatile.acidity      3.608e-01  1.144e-01   3.154 0.001638 ** 
## citric.acid           8.306e-01  1.379e-01   6.024 2.11e-09 ***
## residual.sugar        2.844e-01  1.229e-02  23.135  < 2e-16 ***
## chlorides            -1.462e+00  3.956e-01  -3.696 0.000227 ***
## free.sulfur.dioxide  -2.143e-03  2.057e-03  -1.042 0.297517    
## total.sulfur.dioxide -2.296e-03  6.881e-04  -3.336 0.000868 ***
## density              -6.174e+02  1.342e+01 -45.998  < 2e-16 ***
## pH                    3.762e+00  1.551e-01  24.263  < 2e-16 ***
## sulphates             1.247e+00  1.037e-01  12.020  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.614 on 1588 degrees of freedom
## Multiple R-squared:  0.6701, Adjusted R-squared:  0.668 
## F-statistic: 322.5 on 10 and 1588 DF,  p-value: < 2.2e-16
round(cor(redwines[,-c(11,12)]),dig=2)
##                      fixed.acidity volatile.acidity citric.acid residual.sugar
## fixed.acidity                 1.00            -0.26        0.67           0.11
## volatile.acidity             -0.26             1.00       -0.55           0.00
## citric.acid                   0.67            -0.55        1.00           0.14
## residual.sugar                0.11             0.00        0.14           1.00
## chlorides                     0.09             0.06        0.20           0.06
## free.sulfur.dioxide          -0.15            -0.01       -0.06           0.19
## total.sulfur.dioxide         -0.11             0.08        0.04           0.20
## density                       0.67             0.02        0.36           0.36
## pH                           -0.68             0.23       -0.54          -0.09
## sulphates                     0.18            -0.26        0.31           0.01
##                      chlorides free.sulfur.dioxide total.sulfur.dioxide density
## fixed.acidity             0.09               -0.15                -0.11    0.67
## volatile.acidity          0.06               -0.01                 0.08    0.02
## citric.acid               0.20               -0.06                 0.04    0.36
## residual.sugar            0.06                0.19                 0.20    0.36
## chlorides                 1.00                0.01                 0.05    0.20
## free.sulfur.dioxide       0.01                1.00                 0.67   -0.02
## total.sulfur.dioxide      0.05                0.67                 1.00    0.07
## density                   0.20               -0.02                 0.07    1.00
## pH                       -0.27                0.07                -0.07   -0.34
## sulphates                 0.37                0.05                 0.04    0.15
##                         pH sulphates
## fixed.acidity        -0.68      0.18
## volatile.acidity      0.23     -0.26
## citric.acid          -0.54      0.31
## residual.sugar       -0.09      0.01
## chlorides            -0.27      0.37
## free.sulfur.dioxide   0.07      0.05
## total.sulfur.dioxide -0.07      0.04
## density              -0.34      0.15
## pH                    1.00     -0.20
## sulphates            -0.20      1.00

We observe \(fixed.acidity\) is mildly correlated with \(citric.acid\), \(density\) and \(pH\); \(citric.acid\) is mildly correlated with \(fixed.acidity\), \(volatile.acidity\) and \(pH\); \(free.sulfur.dioxide\) is mildly correlated with \(total.sulfur.dioxide\). We consider removing \(fixed.acidity\), \(citric.acid\) and \(free.sulfur.dioxide\). Let’s look at \(free.sulfur.dioxide\) firstly,

model1 <- lm(alcohol ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides +  total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
summary(model1)
## 
## Call:
## lm(formula = alcohol ~ fixed.acidity + volatile.acidity + citric.acid + 
##     residual.sugar + chlorides + total.sulfur.dioxide + density + 
##     pH + sulphates, data = redwines[, -c(12)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.06145 -0.39706 -0.03917  0.34928  2.44848 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.059e+02  1.302e+01  46.535  < 2e-16 ***
## fixed.acidity         5.300e-01  2.051e-02  25.846  < 2e-16 ***
## volatile.acidity      3.809e-01  1.128e-01   3.377 0.000749 ***
## citric.acid           8.548e-01  1.359e-01   6.289 4.12e-10 ***
## residual.sugar        2.827e-01  1.219e-02  23.198  < 2e-16 ***
## chlorides            -1.487e+00  3.949e-01  -3.766 0.000172 ***
## total.sulfur.dioxide -2.775e-03  5.123e-04  -5.416 7.02e-08 ***
## density              -6.160e+02  1.335e+01 -46.125  < 2e-16 ***
## pH                    3.739e+00  1.534e-01  24.369  < 2e-16 ***
## sulphates             1.242e+00  1.036e-01  11.984  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.614 on 1589 degrees of freedom
## Multiple R-squared:  0.6699, Adjusted R-squared:  0.668 
## F-statistic: 358.2 on 9 and 1589 DF,  p-value: < 2.2e-16

Great, \(total.sulfur.dioxide\)’s p-vlaue decreases significantly.

anova(model1, full.model)
## Analysis of Variance Table
## 
## Model 1: alcohol ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + 
##     chlorides + total.sulfur.dioxide + density + pH + sulphates
## Model 2: alcohol ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + 
##     chlorides + free.sulfur.dioxide + total.sulfur.dioxide + 
##     density + pH + sulphates
##   Res.Df    RSS Df Sum of Sq     F Pr(>F)
## 1   1589 599.11                          
## 2   1588 598.70  1   0.40944 1.086 0.2975

Since F is large, we can drop the \(free.sulfur.dioxide\).

x = model.matrix(model1)[,-1]
dim(x)
## [1] 1599    9
x = x-matrix(apply(x,2,mean),1599,9, byrow=TRUE)
x = x/matrix(apply(x,2,sd),1599,9, byrow=TRUE)
apply(x,2,mean)
##        fixed.acidity     volatile.acidity          citric.acid 
##         3.518207e-16         1.841869e-16        -9.207575e-17 
##       residual.sugar            chlorides total.sulfur.dioxide 
##        -1.156003e-16         8.888973e-17         4.302269e-17 
##              density                   pH            sulphates 
##         2.365840e-14        -2.488013e-17         2.009076e-17
apply(x,2,sd)
##        fixed.acidity     volatile.acidity          citric.acid 
##                    1                    1                    1 
##       residual.sugar            chlorides total.sulfur.dioxide 
##                    1                    1                    1 
##              density                   pH            sulphates 
##                    1                    1                    1
e = eigen(t(x) %*% x)
sqrt(e$val[1]/e$val[9])
## [1] 5.262965

condition number is 5.26 lower than 30, there is no collinearity now.

Then we will use model1 in the following. ## Detect Unusual Observations ### Check High Leverage Points

n=dim(redwines)[1]
p=10
lev=influence(model1)$hat
highlev=lev[lev>2*p/n] #find high leverage points
highlev
##         14         18         20         34         39         43         46 
## 0.02545301 0.02737606 0.02204734 0.02383297 0.01514653 0.02155009 0.01326053 
##         82         84         87         92         93         95         96 
## 0.04430844 0.03069371 0.05546544 0.05546544 0.05719765 0.01359738 0.01480742 
##        107        110        127        128        143        145        152 
## 0.04486258 0.01332999 0.01843085 0.01837401 0.01259396 0.01259396 0.09406530 
##        162        170        182        199        227        241        244 
## 0.01318846 0.03632233 0.01351370 0.01260013 0.03158334 0.01274861 0.02022239 
##        245        259        282        292        325        326        340 
## 0.02022239 0.08429367 0.02774343 0.03931076 0.02792361 0.02792361 0.01614457 
##        355        379        397        401        416        443        452 
## 0.01907500 0.01419726 0.01501122 0.01501122 0.01586589 0.01494878 0.03338733 
##        464        481        495        516        554        555        556 
## 0.01822883 0.06188658 0.01607714 0.01788008 0.02096122 0.01526251 0.01526251 
##        558        592        615        640        650        652        653 
## 0.01568960 0.01635403 0.02503975 0.01551114 0.02214230 0.01553989 0.04606766 
##        673        685        691        693        696        701        711 
## 0.02384996 0.01515557 0.01560221 0.03336782 0.01561996 0.01271687 0.01284710 
##        724        731        755        796        837        838        862 
## 0.04381317 0.03310494 0.03504952 0.01663087 0.01493434 0.01493434 0.03910420 
##        890        912        918        924       1018       1019       1044 
## 0.01262159 0.01897597 0.01622861 0.01622861 0.02527432 0.02527432 0.01598607 
##       1052       1072       1075       1078       1079       1080       1082 
## 0.03402287 0.01353322 0.01353322 0.01294298 0.01294298 0.05381562 0.05682189 
##       1115       1132       1166       1187       1229       1236       1245 
## 0.01921261 0.01312761 0.02530741 0.01546896 0.01266515 0.04301469 0.04763002 
##       1261       1271       1289       1290       1300       1313       1317 
## 0.03297166 0.01351017 0.01670309 0.01670309 0.03341683 0.01460558 0.02260574 
##       1320       1322       1368       1371       1373       1375       1435 
## 0.03897217 0.02066571 0.01296512 0.03548240 0.03548240 0.01763324 0.05831006 
##       1436       1475       1477       1509       1559       1571       1575 
## 0.05831006 0.04381646 0.04381646 0.01361479 0.01592611 0.01330012 0.06009596 
##       1590 
## 0.01271003
halfnorm(lev,6, labs=as.character(1:length(highlev)), ylab="Leverages")

### Check Outliers

StuR=rstudent(model1)
qt(0.05/(2*n),n-p-1)
## [1] -4.176071
sort(abs(StuR),decreasing=TRUE)[1:10]
##      652      560      565      396      354      557      559      494 
## 4.038199 4.018534 4.018534 3.952545 3.858127 3.694477 3.694477 3.670237 
##      500      268 
## 3.670237 3.455404

No outliers

Check High Influential Points

cook=cooks.distance(model1)
max(cook)
## [1] 0.07210158
halfnorm(cook,6,labs=as.character(1:length(cook)),ylab="Cook's distances")

We have some high leverage points, but none of them are high influential. We don’t have outliers.

Check Assumptions

Check Homoscedasticity

plot(model1,which=1)

bptest(model1)
## 
##  studentized Breusch-Pagan test
## 
## data:  model1
## BP = 169.31, df = 9, p-value < 2.2e-16

Homoscedasticity doesn’t hold

Check Normality

plot(model1,which=2)

hist(model1$residuals)

ks.test(residuals(model1), y=pnorm)
## Warning in ks.test(residuals(model1), y = pnorm): ties should not be present for
## the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  residuals(model1)
## D = 0.14307, p-value < 2.2e-16
## alternative hypothesis: two-sided

Normality doesn’t hold.

Tried removing influential points

redwines_clean = redwines[-c(481, 1575),]
model_clean = lm(alcohol ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides +  total.sulfur.dioxide + density + pH + sulphates, data=redwines_clean[,-c(12)])
summary(model_clean)
## 
## Call:
## lm(formula = alcohol ~ fixed.acidity + volatile.acidity + citric.acid + 
##     residual.sugar + chlorides + total.sulfur.dioxide + density + 
##     pH + sulphates, data = redwines_clean[, -c(12)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.07939 -0.39264 -0.04045  0.34956  2.48010 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.060e+02  1.294e+01  46.839  < 2e-16 ***
## fixed.acidity         5.241e-01  2.043e-02  25.649  < 2e-16 ***
## volatile.acidity      3.704e-01  1.122e-01   3.302 0.000982 ***
## citric.acid           8.863e-01  1.359e-01   6.521 9.34e-11 ***
## residual.sugar        3.002e-01  1.266e-02  23.712  < 2e-16 ***
## chlorides            -1.540e+00  3.925e-01  -3.924 9.07e-05 ***
## total.sulfur.dioxide -2.935e-03  5.102e-04  -5.752 1.06e-08 ***
## density              -6.160e+02  1.327e+01 -46.419  < 2e-16 ***
## pH                    3.717e+00  1.525e-01  24.370  < 2e-16 ***
## sulphates             1.232e+00  1.030e-01  11.962  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6101 on 1587 degrees of freedom
## Multiple R-squared:  0.6742, Adjusted R-squared:  0.6724 
## F-statistic:   365 on 9 and 1587 DF,  p-value: < 2.2e-16
plot(model_clean,which=1)

bptest(model_clean)
## 
##  studentized Breusch-Pagan test
## 
## data:  model_clean
## BP = 164.35, df = 9, p-value < 2.2e-16
plot(model_clean,which=2)

ks.test(residuals(model_clean), y=pnorm)
## Warning in ks.test(residuals(model_clean), y = pnorm): ties should not be
## present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  residuals(model_clean)
## D = 0.14385, p-value < 2.2e-16
## alternative hypothesis: two-sided

Use Box Cox Transformation

min(redwines$alcohol)
## [1] 8.4

Make sure all alchohol values are positive.

model.transformation=boxcox(model1,lambda=seq(-2, -1, by=0.01))

model.transformation$x[model.transformation$y==max(model.transformation$y)]
## [1] -1.53

\(\lambda\) is -1.53

model_bx=lm((alcohol^(-1.53)-1)/(-1.53) ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides +  total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
summary(model_bx)
## 
## Call:
## lm(formula = (alcohol^(-1.53) - 1)/(-1.53) ~ fixed.acidity + 
##     volatile.acidity + citric.acid + residual.sugar + chlorides + 
##     total.sulfur.dioxide + density + pH + sulphates, data = redwines[, 
##     -c(12)])
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0056555 -0.0010128 -0.0000063  0.0009974  0.0064010 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2.076e+00  3.340e-02  62.155  < 2e-16 ***
## fixed.acidity         1.304e-03  5.260e-05  24.785  < 2e-16 ***
## volatile.acidity      9.833e-04  2.893e-04   3.399 0.000692 ***
## citric.acid           2.060e-03  3.487e-04   5.908 4.23e-09 ***
## residual.sugar        6.800e-04  3.126e-05  21.751  < 2e-16 ***
## chlorides            -4.670e-03  1.013e-03  -4.611 4.33e-06 ***
## total.sulfur.dioxide -8.396e-06  1.314e-06  -6.389 2.18e-10 ***
## density              -1.491e+00  3.426e-02 -43.529  < 2e-16 ***
## pH                    9.233e-03  3.936e-04  23.461  < 2e-16 ***
## sulphates             3.181e-03  2.658e-04  11.970  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001575 on 1589 degrees of freedom
## Multiple R-squared:  0.6509, Adjusted R-squared:  0.649 
## F-statistic: 329.2 on 9 and 1589 DF,  p-value: < 2.2e-16

All predictors are significant. We don’t need to move any predictors. Now let’s check unusual observations and assumptions again.

Check Unusual Observations

Check High Leverages

lev_new=influence(model_bx)$hat
highlev_new=lev_new[lev_new>2*p/n] #find high leverage points
highlev_new
##         14         18         20         34         39         43         46 
## 0.02545301 0.02737606 0.02204734 0.02383297 0.01514653 0.02155009 0.01326053 
##         82         84         87         92         93         95         96 
## 0.04430844 0.03069371 0.05546544 0.05546544 0.05719765 0.01359738 0.01480742 
##        107        110        127        128        143        145        152 
## 0.04486258 0.01332999 0.01843085 0.01837401 0.01259396 0.01259396 0.09406530 
##        162        170        182        199        227        241        244 
## 0.01318846 0.03632233 0.01351370 0.01260013 0.03158334 0.01274861 0.02022239 
##        245        259        282        292        325        326        340 
## 0.02022239 0.08429367 0.02774343 0.03931076 0.02792361 0.02792361 0.01614457 
##        355        379        397        401        416        443        452 
## 0.01907500 0.01419726 0.01501122 0.01501122 0.01586589 0.01494878 0.03338733 
##        464        481        495        516        554        555        556 
## 0.01822883 0.06188658 0.01607714 0.01788008 0.02096122 0.01526251 0.01526251 
##        558        592        615        640        650        652        653 
## 0.01568960 0.01635403 0.02503975 0.01551114 0.02214230 0.01553989 0.04606766 
##        673        685        691        693        696        701        711 
## 0.02384996 0.01515557 0.01560221 0.03336782 0.01561996 0.01271687 0.01284710 
##        724        731        755        796        837        838        862 
## 0.04381317 0.03310494 0.03504952 0.01663087 0.01493434 0.01493434 0.03910420 
##        890        912        918        924       1018       1019       1044 
## 0.01262159 0.01897597 0.01622861 0.01622861 0.02527432 0.02527432 0.01598607 
##       1052       1072       1075       1078       1079       1080       1082 
## 0.03402287 0.01353322 0.01353322 0.01294298 0.01294298 0.05381562 0.05682189 
##       1115       1132       1166       1187       1229       1236       1245 
## 0.01921261 0.01312761 0.02530741 0.01546896 0.01266515 0.04301469 0.04763002 
##       1261       1271       1289       1290       1300       1313       1317 
## 0.03297166 0.01351017 0.01670309 0.01670309 0.03341683 0.01460558 0.02260574 
##       1320       1322       1368       1371       1373       1375       1435 
## 0.03897217 0.02066571 0.01296512 0.03548240 0.03548240 0.01763324 0.05831006 
##       1436       1475       1477       1509       1559       1571       1575 
## 0.05831006 0.04381646 0.04381646 0.01361479 0.01592611 0.01330012 0.06009596 
##       1590 
## 0.01271003
halfnorm(lev_new,6, labs=as.character(1:length(highlev_new)), ylab="Leverages")

### Check Outliers

StuR_new=rstudent(model_bx)
qt(0.05/(2*n),n-p-1)
## [1] -4.176071
sort(abs(StuR_new),decreasing=TRUE)[1:10]
##      652      244      245      494      500      557      559      560 
## 4.116362 3.641438 3.641438 3.605038 3.605038 3.605006 3.605006 3.445001 
##      565      372 
## 3.445001 3.432353

No outliers

Check High Influential Points

cook_new=cooks.distance(model_bx)
max(cook_new)
## [1] 0.07234619
halfnorm(cook_new,6,labs=as.character(1:length(cook_new)),ylab="Cook's distances")

We have some high leverage points, but none of them are high influential. We don’t have outliers.

Check Assumptions

Check Homoscedasticity

plot(model_bx,which=1)

bptest(model_bx)
## 
##  studentized Breusch-Pagan test
## 
## data:  model_bx
## BP = 184.18, df = 9, p-value < 2.2e-16

Homoscedasticity doesn’t hold

Check Normality

plot(model_bx,which=2)

hist(model_bx$residuals)

ks.test(residuals(model_bx), y=pnorm)
## Warning in ks.test(residuals(model_bx), y = pnorm): ties should not be present
## for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  residuals(model_bx)
## D = 0.49774, p-value < 2.2e-16
## alternative hypothesis: two-sided

Normality doesn’t hold. Box-Cox Transformation can’t fix the violation to assumptions.

Check Non-linearity

Added-variables plots

#fixed.acidity
modela1 = lm(alcohol ~ volatile.acidity + citric.acid + residual.sugar + chlorides +  total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
modela2 = lm(fixed.acidity ~ volatile.acidity + citric.acid + residual.sugar + chlorides +  total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
reg.a = lm(modela1$residuals ~ modela2$residuals)
#volatile.acidity
modelb1 = lm(alcohol ~ fixed.acidity + citric.acid + residual.sugar + chlorides +  total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
modelb2 = lm(volatile.acidity ~ fixed.acidity + citric.acid + residual.sugar + chlorides +  total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
reg.b = lm(modelb1$residuals ~ modelb2$residuals)
#citric.acid
modelc1 = lm(alcohol ~ fixed.acidity + volatile.acidity + residual.sugar + chlorides +  total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
modelc2 = lm(citric.acid ~ fixed.acidity + volatile.acidity + residual.sugar + chlorides +  total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
reg.c = lm(modelc1$residuals ~ modelc2$residuals)
#residual.sugar
modeld1 = lm(alcohol ~ fixed.acidity + volatile.acidity + citric.acid + chlorides +  total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
modeld2 = lm(residual.sugar ~ fixed.acidity + volatile.acidity + citric.acid + chlorides +  total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
reg.d = lm(modeld1$residuals ~ modeld2$residuals)
#chlorides
modele1 = lm(alcohol ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar +  total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
modele2 = lm(chlorides ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
reg.e = lm(modele1$residuals ~ modele2$residuals)
#total.sulfur.dioxide
modelf1 = lm(alcohol ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + density + pH + sulphates, data=redwines[,-c(12)])
modelf2 = lm(total.sulfur.dioxide ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides +  density + pH + sulphates, data=redwines[,-c(12)])
reg.f = lm(modelf1$residuals ~ modelf2$residuals)
#density
modelg1 = lm(alcohol ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides +  total.sulfur.dioxide + pH + sulphates, data=redwines[,-c(12)])
modelg2 = lm(density ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides +  total.sulfur.dioxide + pH + sulphates, data=redwines[,-c(12)])
reg.g = lm(modelg1$residuals ~ modelg2$residuals)
#pH
modelh1 = lm(alcohol ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides +  total.sulfur.dioxide + density + sulphates, data=redwines[,-c(12)])
modelh2 = lm(pH ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides +  total.sulfur.dioxide + density + sulphates, data=redwines[,-c(12)])
reg.h = lm(modelh1$residuals ~ modelh2$residuals)
#sulphates
modeli1 = lm(alcohol ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides +  total.sulfur.dioxide + density + pH, data=redwines[,-c(12)])
modeli2 = lm(sulphates ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides +  total.sulfur.dioxide + density + pH, data=redwines[,-c(12)])
reg.i = lm(modeli1$residuals ~ modeli2$residuals)
plot(modela2$residuals, modela1$residuals)
abline(reg.a, col="red")

plot(modelb2$residuals, modelb1$residuals)
abline(reg.b, col="red")

plot(modelc2$residuals, modelc1$residuals)
abline(reg.c, col="red")

plot(modeld2$residuals, modeld1$residuals)
abline(reg.d, col="red")

plot(modele2$residuals, modele1$residuals)
abline(reg.e, col="red")

plot(modelf2$residuals, modelf1$residuals)
abline(reg.f, col="red")

plot(modelg2$residuals, modelg1$residuals)
abline(reg.g, col="red")

plot(modelh2$residuals, modelh1$residuals)
abline(reg.h, col="red")

plot(modeli2$residuals, modeli1$residuals)
abline(reg.i, col="red")

All plots except the \(residual.sugar\), we try to transform the \(residual.sugar\).

#residual.sugar
modeld1 = lm(log(alcohol) ~ fixed.acidity + volatile.acidity + citric.acid + chlorides +  total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
modeld2 = lm(log(residual.sugar) ~ fixed.acidity + volatile.acidity + citric.acid + chlorides +  total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
reg.d = lm(modeld1$residuals ~ modeld2$residuals)
plot(modeld2$residuals, modeld1$residuals)
abline(reg.d, col="red")

We can find that \(\log({\text{residual.sugar}})\) works much better. Then, Let’s try the new model!

New Model

modelN=lm(log(alcohol) ~ fixed.acidity + volatile.acidity + citric.acid + log(residual.sugar) + chlorides +  total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
summary(modelN)
## 
## Call:
## lm(formula = log(alcohol) ~ fixed.acidity + volatile.acidity + 
##     citric.acid + log(residual.sugar) + chlorides + total.sulfur.dioxide + 
##     density + pH + sulphates, data = redwines[, -c(12)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.21937 -0.03359 -0.00364  0.03292  0.24246 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           5.974e+01  1.146e+00  52.119  < 2e-16 ***
## fixed.acidity         4.882e-02  1.776e-03  27.480  < 2e-16 ***
## volatile.acidity      2.479e-02  9.840e-03   2.519 0.011862 *  
## citric.acid           6.561e-02  1.188e-02   5.521 3.93e-08 ***
## log(residual.sugar)   1.238e-01  4.292e-03  28.834  < 2e-16 ***
## chlorides            -1.238e-01  3.448e-02  -3.591 0.000339 ***
## total.sulfur.dioxide -3.102e-04  4.470e-05  -6.939 5.73e-12 ***
## density              -5.930e+01  1.175e+00 -50.477  < 2e-16 ***
## pH                    3.355e-01  1.335e-02  25.128  < 2e-16 ***
## sulphates             1.169e-01  9.030e-03  12.941  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05355 on 1589 degrees of freedom
## Multiple R-squared:  0.7085, Adjusted R-squared:  0.7069 
## F-statistic: 429.1 on 9 and 1589 DF,  p-value: < 2.2e-16

\(volatile.acidity\) isn’t significant. Let’s try to remove it!

modelN2=lm(log(alcohol) ~ fixed.acidity + citric.acid + log(residual.sugar) + chlorides +  total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
summary(modelN2)
## 
## Call:
## lm(formula = log(alcohol) ~ fixed.acidity + citric.acid + log(residual.sugar) + 
##     chlorides + total.sulfur.dioxide + density + pH + sulphates, 
##     data = redwines[, -c(12)])
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.220352 -0.033302 -0.003927  0.032990  0.246753 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           5.943e+01  1.141e+00  52.064  < 2e-16 ***
## fixed.acidity         4.927e-02  1.770e-03  27.831  < 2e-16 ***
## citric.acid           4.999e-02  1.015e-02   4.923 9.42e-07 ***
## log(residual.sugar)   1.241e-01  4.296e-03  28.895  < 2e-16 ***
## chlorides            -1.017e-01  3.340e-02  -3.044  0.00237 ** 
## total.sulfur.dioxide -2.959e-04  4.441e-05  -6.662 3.72e-11 ***
## density              -5.898e+01  1.170e+00 -50.415  < 2e-16 ***
## pH                    3.374e-01  1.335e-02  25.275  < 2e-16 ***
## sulphates             1.122e-01  8.854e-03  12.673  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05364 on 1590 degrees of freedom
## Multiple R-squared:  0.7073, Adjusted R-squared:  0.7059 
## F-statistic: 480.4 on 8 and 1590 DF,  p-value: < 2.2e-16

Now let’s check unusual observations and assumptions again. ## Check Unusual Observations ### Check High Leverages

p=9
levN=influence(modelN2)$hat
highlevN=levN[levN>2*p/n] #find high leverage points
highlevN
##         14         15         16         18         20         34         43 
## 0.02374118 0.01131942 0.01178307 0.02724947 0.02097420 0.01425524 0.02038424 
##         46         80         82         84         87         89         92 
## 0.01325381 0.01305942 0.04422410 0.03070217 0.05467641 0.01196870 0.05467641 
##         93         96        107        152        162        170        182 
## 0.05618973 0.01473502 0.04488286 0.09288328 0.01367020 0.03578282 0.01327085 
##        202        227        241        244        245        259        282 
## 0.01183720 0.03071777 0.01264097 0.01720205 0.01720205 0.08369923 0.02796887 
##        292        325        326        340        341        355        397 
## 0.03220335 0.01616639 0.01616639 0.01612463 0.01130387 0.01842680 0.01334713 
##        401        416        443        452        464        481        495 
## 0.01334713 0.01537383 0.01257489 0.03219085 0.01425742 0.02350069 0.01303660 
##        516        555        556        558        592        615        640 
## 0.01671361 0.01521687 0.01521687 0.01565473 0.01640204 0.02320278 0.01552666 
##        650        652        653        693        696        724        731 
## 0.01925028 0.01462366 0.04313287 0.03300083 0.01549102 0.04359091 0.03301774 
##        755        796        834        837        838        862        912 
## 0.03514777 0.01144328 0.01259379 0.01488624 0.01488624 0.01783962 0.01375028 
##        918        924        942       1018       1019       1044       1052 
## 0.01264128 0.01264128 0.01162118 0.02534206 0.02534206 0.01134274 0.03472209 
##       1072       1075       1078       1079       1080       1082       1115 
## 0.01127439 0.01127439 0.01319684 0.01319684 0.05208184 0.05507926 0.02084965 
##       1166       1187       1227       1229       1236       1245       1261 
## 0.02508957 0.01321362 0.01192691 0.01183504 0.02194849 0.02310269 0.03172148 
##       1270       1271       1289       1290       1317       1320       1322 
## 0.01234367 0.01245992 0.01627316 0.01627316 0.02253381 0.03522612 0.02066344 
##       1368       1371       1373       1375       1404       1435       1436 
## 0.01237472 0.03388543 0.03388543 0.01839175 0.01184521 0.02385416 0.02385416 
##       1471       1475       1477       1509       1559       1571       1575 
## 0.01258353 0.02005344 0.02005344 0.01357720 0.01639033 0.01347953 0.03627797
halfnorm(levN,6, labs=as.character(1:length(highlevN)), ylab="Leverages")

### Check Outliers

StuRN=rstudent(modelN2)
qt(0.05/(2*n),n-p-1)
## [1] -4.176063
sort(abs(StuRN),decreasing=TRUE)[1:10]
##      652      511      494      500      244      245      560      565 
## 4.664595 4.148651 3.907091 3.907091 3.812396 3.812396 3.810034 3.810034 
##      410      354 
## 3.733976 3.720389

#652 is an outlier.

Check High Influential Points

cookN=cooks.distance(modelN2)
max(cookN)
## [1] 0.03541655
halfnorm(cookN,6,labs=as.character(1:length(cookN)),ylab="Cook's distances")

We have some high leverage points and one outlier, but none of them are high influential.

Check Assumptions

Check Homoscedasticity

plot(modelN2,which=1)

bptest(modelN2)
## 
##  studentized Breusch-Pagan test
## 
## data:  modelN2
## BP = 134.14, df = 8, p-value < 2.2e-16

Homoscedasticity doesn’t hold

Check Normality

plot(modelN2,which=2)

hist(modelN2$residuals)

ks.test(residuals(modelN2), y=pnorm)
## Warning in ks.test(residuals(modelN2), y = pnorm): ties should not be present
## for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  residuals(modelN2)
## D = 0.44287, p-value < 2.2e-16
## alternative hypothesis: two-sided

Normality doesn’t hold. Still can’t fix the violation to assumptions.

We will use Generalized Least Squares next.

Generalized Least Squares

lm.resid=lm(abs(modelN2$residuals)~ fixed.acidity + citric.acid + log(residual.sugar) + chlorides +  total.sulfur.dioxide + density + pH + sulphates, data=redwines[,-c(12)])
summary(lm.resid)
## 
## Call:
## lm(formula = abs(modelN2$residuals) ~ fixed.acidity + citric.acid + 
##     log(residual.sugar) + chlorides + total.sulfur.dioxide + 
##     density + pH + sulphates, data = redwines[, -c(12)])
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.063461 -0.023523 -0.006196  0.016038  0.194110 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           8.933e-02  7.160e-01   0.125 0.900725    
## fixed.acidity         5.539e-03  1.110e-03   4.988 6.75e-07 ***
## citric.acid           2.473e-03  6.369e-03   0.388 0.697830    
## log(residual.sugar)   9.709e-03  2.695e-03   3.603 0.000325 ***
## chlorides            -2.636e-02  2.095e-02  -1.258 0.208466    
## total.sulfur.dioxide  1.251e-05  2.786e-05   0.449 0.653363    
## density              -2.079e-01  7.338e-01  -0.283 0.777003    
## pH                    3.057e-02  8.374e-03   3.651 0.000270 ***
## sulphates             6.166e-03  5.554e-03   1.110 0.267046    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03364 on 1590 degrees of freedom
## Multiple R-squared:  0.06416,    Adjusted R-squared:  0.05945 
## F-statistic: 13.63 on 8 and 1590 DF,  p-value: < 2.2e-16
redwines$weight = 1/lm.resid$fitted.values^2
head(redwines)
##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           7.4             0.70        0.00            1.9     0.076
## 2           7.8             0.88        0.00            2.6     0.098
## 3           7.8             0.76        0.04            2.3     0.092
## 4          11.2             0.28        0.56            1.9     0.075
## 5           7.4             0.70        0.00            1.9     0.076
## 6           7.4             0.66        0.00            1.8     0.075
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  11                   34  0.9978 3.51      0.56     9.4
## 2                  25                   67  0.9968 3.20      0.68     9.8
## 3                  15                   54  0.9970 3.26      0.65     9.8
## 4                  17                   60  0.9980 3.16      0.58     9.8
## 5                  11                   34  0.9978 3.51      0.56     9.4
## 6                  13                   40  0.9978 3.51      0.56     9.4
##   quality   weight
## 1       5 680.4699
## 2       5 821.0768
## 3       5 797.5032
## 4       6 392.0977
## 5       5 680.4699
## 6       5 695.7573
ModelGLS = lm(log(alcohol) ~ fixed.acidity + citric.acid + log(residual.sugar) + chlorides +  total.sulfur.dioxide + density + pH + sulphates, data=redwines, weights=weight)
summary(ModelGLS)
## 
## Call:
## lm(formula = log(alcohol) ~ fixed.acidity + citric.acid + log(residual.sugar) + 
##     chlorides + total.sulfur.dioxide + density + pH + sulphates, 
##     data = redwines, weights = weight)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1487 -0.8459 -0.0803  0.7985  4.9741 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.324e+01  1.040e+00  60.788  < 2e-16 ***
## fixed.acidity         5.170e-02  1.736e-03  29.781  < 2e-16 ***
## citric.acid           4.506e-02  9.327e-03   4.831 1.49e-06 ***
## log(residual.sugar)   1.258e-01  4.370e-03  28.784  < 2e-16 ***
## chlorides            -1.114e-01  2.447e-02  -4.552 5.72e-06 ***
## total.sulfur.dioxide -2.429e-04  4.076e-05  -5.959 3.12e-09 ***
## density              -6.283e+01  1.067e+00 -58.909  < 2e-16 ***
## pH                    3.355e-01  1.196e-02  28.057  < 2e-16 ***
## sulphates             1.201e-01  8.004e-03  15.004  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.288 on 1590 degrees of freedom
## Multiple R-squared:  0.7581, Adjusted R-squared:  0.7569 
## F-statistic:   623 on 8 and 1590 DF,  p-value: < 2.2e-16

Detect Unusual Observations

Check High Leverage Points

p=9
lev=influence(ModelGLS)$hat
highlev=lev[lev>2*p/n] #find high leverage points
highlev
##         14         18         20         35         39         43         46 
## 0.02406689 0.03006787 0.02615795 0.02197022 0.01584188 0.02060839 0.01306117 
##         50         80         82         84         87         92         93 
## 0.03057213 0.01480537 0.04985504 0.04472050 0.05556373 0.05556373 0.05514656 
##         95         96        107        143        145        146        148 
## 0.01377799 0.01419120 0.06185484 0.01315654 0.01315654 0.01173450 0.01501330 
##        152        162        170        199        202        227        231 
## 0.09704776 0.02279010 0.06431675 0.01678839 0.01127107 0.02461942 0.01228704 
##        241        243        259        282        292        355        391 
## 0.01212927 0.01360380 0.18483615 0.01999847 0.01599239 0.03899989 0.01293439 
##        397        401        440        452        464        554        589 
## 0.01151761 0.01151761 0.01968041 0.03747609 0.01798278 0.01364680 0.01235031 
##        592        615        640        650        693        696        724 
## 0.03424956 0.02587062 0.01214951 0.01915306 0.03219727 0.01662860 0.06696009 
##        731        755        803        822        837        838        862 
## 0.02353455 0.05093519 0.01457065 0.01224451 0.01794220 0.01794220 0.01686589 
##        917       1018       1019       1052       1080       1082       1086 
## 0.01441685 0.08863633 0.08863633 0.04277325 0.03560511 0.03729104 0.01507573 
##       1099       1115       1127       1132       1158       1166       1227 
## 0.01134655 0.02341700 0.01426647 0.01780662 0.01344471 0.02613311 0.01822012 
##       1229       1236       1241       1245       1261       1270       1271 
## 0.01441523 0.01966429 0.01129933 0.02027381 0.03196594 0.01726420 0.01644981 
##       1289       1290       1296       1297       1299       1317       1320 
## 0.01307448 0.01307448 0.01256181 0.01256181 0.01143870 0.02007992 0.03801925 
##       1322       1368       1371       1373       1375       1390       1393 
## 0.02058252 0.01474052 0.03750807 0.03750807 0.04764642 0.01880954 0.01587503 
##       1404       1457       1471       1476       1478       1483       1509 
## 0.01604786 0.01205987 0.01813317 0.01265119 0.01265119 0.01291716 0.01512627 
##       1559       1567       1571       1575       1590 
## 0.01795038 0.01255547 0.01490928 0.02931215 0.01177936
halfnorm(lev,6, labs=as.character(1:length(highlev)), ylab="Leverages")

### Check Outliers

StuR=rstudent(ModelGLS)
qt(0.05/(2*n),n-p-1)
## [1] -4.176063
sort(abs(StuR),decreasing=TRUE)[1:10]
##     1018     1019     1427      652      494      500      654      727 
## 4.063682 4.063682 3.881344 3.777157 3.353329 3.353329 3.309287 3.243646 
##     1234      372 
## 3.234305 3.190007

No ouliers

Check High Influential Points

cook=cooks.distance(ModelGLS)
sort(abs(cook),decreasing=TRUE)[1:10]
##       1018       1019        152        227        837        838        259 
## 0.17672589 0.17672589 0.02577125 0.01996817 0.01929981 0.01929981 0.01859494 
##       1320       1115       1132 
## 0.01721969 0.01494359 0.01463477
halfnorm(cook,6,labs=as.character(1:length(cook)),ylab="Cook's distances")

We have some high leverage points, but none of them are high influential. No outliers.

Check Assumptions

Check Homoscedasticity

plot(ModelGLS,which=1)

bptest(ModelGLS)
## 
##  studentized Breusch-Pagan test
## 
## data:  ModelGLS
## BP = 134.14, df = 8, p-value < 2.2e-16

Homoscedasticity doesn’t hold

Check Normality

plot(ModelGLS,which=2)

hist(ModelGLS$residuals)

ks.test(residuals(ModelGLS), y=pnorm)
## Warning in ks.test(residuals(ModelGLS), y = pnorm): ties should not be present
## for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  residuals(ModelGLS)
## D = 0.44265, p-value < 2.2e-16
## alternative hypothesis: two-sided

Still can’t fix the violation to assumptions.